Analysis following previously published work on Brazilian data, using the benford.analysis R package.
To be completed.
clean_folder <- "~/Dropbox/aaa/studies/oucomo/clean data/"
library(readxl)
library(magrittr)
library(dplyr)
library(tidyr)
library(purrr)
library(benford.analysis)
library(tibble)
library(lubridate)
library(sf)
library(stringr)
library(stringi)
A function to extract the statistics of a Benford analysis we want to look at:
extract_statistics <- function(x) {
with(x, c(with(info, c(n, n.second.order)),
stats$chisq$statistic,
MAD,
MAD.conformity,
distortion.factor,
stats$mantissa.arc.test$statistic))
}
A function to make a table of statistics (in column) for each of the provinces (in row):
make_statistics_table <- function(x) {
x %>%
map_df(extract_statistics) %>%
t() %>%
as.data.frame() %>%
rownames_to_column() %>%
setNames(c("province", "n", "n2", "Chisq", "MAD", "MAD conformity", "DF", "Mantissa")) %>%
as_tibble() %>%
mutate_at(vars(starts_with("n")), as.integer) %>%
mutate_at(vars(MAD, DF, Chisq, Mantissa), as.numeric)
}
A function that plots the diagnostic plots for each of the provinces:
plot_digits_distribution <- function(x, y) {
xmarks <- barplot(x[["bfd"]]$data.dist.freq, col = 4,
xlab = "digits", ylab = "frequency",
ylim = c(0, 1.1 * max(c(x[["bfd"]]$data.dist.freq,
x[["bfd"]]$benford.dist.freq))))
axis(1, at = xmarks, labels = x[["bfd"]]$digits)
lines(xmarks, x[["bfd"]]$benford.dist.freq, lwd = 2, col = 2)
title(y, line = -1)
}
The 2019 census data (Vinh Long is missing):
census <- paste0(clean_folder, "census2019.rds") %>%
readRDS() %>%
group_by(province) %>%
summarise(popsize = sum(n)) %>%
mutate_at("province", stri_trans_general, "Latin-ASCII") %>%
mutate_at("province", str_remove, "Thanh pho |Tinh ") %>%
bind_rows(tibble(province = "Vinh Long", popsize = 1141677))
Downloading the file if not in the folder:
if(!file.exists("gadm36_VNM_1_sf.rds"))
download.file("https://biogeo.ucdavis.edu/data/gadm3.6/Rsf/gadm36_VNM_1_sf.rds",
"gadm36_VNM_1_sf.rds")
Loading the data:
gadm <- "gadm36_VNM_1_sf.rds" %>%
readRDS() %>%
st_set_crs(4326) %>%
transmute(province = VARNAME_1, geometry) %>%
mutate(centroid = map(geometry, st_centroid),
coordinates = map(centroid, st_coordinates),
latitude = map(coordinates, extract2, 2)) %>%
left_join(census, "province")
Reading the data:
rogier <- paste0(clean_folder, "Fourth cluster First wave.xlsx") %>%
read_excel() %>%
rename(date = `...1`)
Getting rid off whatever is below the Total row:
rogier %<>% head(which(rogier$date == "Total") - 1)
Reformatting the data:
rogier %<>%
select(date, `An Giang`:last_col()) %>%
filter(! is.na(date)) %>%
mutate_all(replace_na, 0) %>%
mutate_all(as.integer) %>%
mutate_at("date", as.Date, origin = "1899-12-30")
which looks like:
rogier
## # A tibble: 339 x 64
## date `An Giang` `Ba Ria - Vung Tau` `Bac Giang` `Bac Kan` `Bac Lieu`
## <date> <int> <int> <int> <int> <int>
## 1 2020-01-23 0 0 0 0 0
## 2 2020-02-01 0 0 0 0 0
## 3 2020-02-03 0 0 0 0 0
## 4 2020-02-04 0 0 0 0 0
## 5 2020-02-06 0 0 0 0 0
## 6 2020-02-07 0 0 0 0 0
## 7 2020-02-09 0 0 0 0 0
## 8 2020-02-11 0 0 0 0 0
## 9 2020-02-13 0 0 0 0 0
## 10 2020-03-06 0 0 0 0 0
## # … with 329 more rows, and 58 more variables: Bac Ninh <int>, Ben Tre <int>,
## # Binh Dinh <int>, Binh Duong <int>, Binh Phuoc <int>, Binh Thuan <int>,
## # Ca Mau <int>, Can Tho <int>, Cao Bang <int>, Da Nang <int>, Dak Lak <int>,
## # Dak Nong <int>, Dien Bien <int>, Dong Nai <int>, Dong Thap <int>,
## # Gia Lai <int>, Ha Giang <int>, Ha Nam <int>, Ha Noi <int>, Ha Tinh <int>,
## # Hai Duong <int>, Hai Phong <int>, Hau Giang <int>, Ho Chi Minh <int>,
## # Hoa Binh <int>, Hue <int>, Hung Yen <int>, Khanh Hoa <int>,
## # Kien Giang <int>, Kon Tum <int>, Lai Chau <int>, Lam Dong <int>,
## # Lang Son <int>, Lao Cai <int>, Long An <int>, Nam Dinh <int>,
## # Nghe An <int>, Ninh Binh <int>, Ninh Thuan <int>, Phu Tho <int>,
## # Phu Yen <int>, Quang Binh <int>, Quang Nam <int>, Quang Ngai <int>,
## # Quang Ninh <int>, Quang Tri <int>, Soc Trang <int>, Son La <int>,
## # Tay Ninh <int>, Thai Binh <int>, Thai Nguyen <int>, Thanh Hoa <int>,
## # Tien Giang <int>, Tra Vinh <int>, Tuyen Quang <int>, Vinh Long <int>,
## # Vinh Phuc <int>, Yen Bai <int>
For compatibility of other data sets, we want to rename Hue into Thua Thien Hue:
names(rogier) <- str_replace(names(rogier), "Hue", "Thua Thien Hue")
Data automatically collected from NCSC:
ncsc <- readRDS(paste0(clean_folder, "NCSC data/covid.rds")) %>%
filter(var == "incidence") %>%
select(-var) %>%
mutate_at("province", str_replace, "TP HCM", "Ho Chi Minh")
which looks like this:
ncsc
## # A tibble: 37,189 x 3
## date province n
## <date> <chr> <int>
## 1 2020-03-18 An Giang 0
## 2 2020-03-18 Ba Ria - Vung Tau 0
## 3 2020-03-18 Bac Giang 0
## 4 2020-03-18 Bac Kan 0
## 5 2020-03-18 Bac Lieu 0
## 6 2020-03-18 Bac Ninh 0
## 7 2020-03-18 Ben Tre 0
## 8 2020-03-18 Binh Dinh 0
## 9 2020-03-18 Binh Duong 0
## 10 2020-03-18 Binh Phuoc 0
## # … with 37,179 more rows
A long format of Rogier’s data:
rogier_long <- pivot_longer(rogier, -date, "province", values_to = "n_rogier")
Merging Rogier’s and NCSC’s data:
two_sources <- rogier_long %>%
full_join(ncsc, c("date", "province")) %>%
mutate_at(vars(starts_with("n")), replace_na, 0) %>%
filter(n_rogier > 0 | n > 0)
Plots to compare the two sources of data:
par(mfrow = c(1, 3), plt = c(.15, .9, .17, .9))
two_sources %$%
plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
col = adjustcolor(4, .5))
abline(0, 1)
two_sources %$%
plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
col = adjustcolor(4, .5), xlim = c(0, 9000), ylim = c(0, 9000))
abline(0, 1)
two_sources %>%
mutate_at(vars(starts_with("n")), ~ .x + .1) %$%
plot(n_rogier, n, xlab = "Rogier's data", ylab = "NCSC data",
col = adjustcolor(4, .5), log = "xy")
abline(0, 1)
Comparing the dates of zero-counts for the two data sets:
two_sources %>%
filter(n < 1) %>%
pull(date) %>%
plot(., jitter(rep(1, length(.))), axes = FALSE, col = 2, ann = FALSE,
xlim = ymd(c(20200101, 20211231)), ylim = c(.8, 1.1))
two_sources %>%
filter(n_rogier < 1) %>%
pull(date) %>%
points(., jitter(rep(.9, length(.))), col = 4)
month_val <- seq(1, 12, 2)
axis(1,
ydm(paste0("2020-1-", month_val), paste0("2021-1-", month_val), "2022-1-1"),
c(paste0(month.abb[month_val], "-20"), paste0(month.abb[month_val], "-21"), "Jan-22"))
month_val2 <- 1:12
abline(v = ydm(paste0("2020-1-", month_val2), paste0("2021-1-", month_val2), "2022-1-1")[-c(1, 2)])
text(ymd(20200115), 1, "NCSC", col = 2)
text(ymd(20200115), .9, "Rogier", col = 4)
The distribution of the counts in one data set when it’s 0 in the other data set:
par(mfrow = c(1, 2), plt = c(.15, .95, .17, .95))
hist2 <- function(..., color) {
hist(..., n = 1000, col = color, border = color, yaxs = "i",
main = NA, ylab = "frequency")
}
two_sources %>%
filter(n_rogier < 1) %>%
pull(n) %>%
hist2(color = 4, xlab = "value in NCSC when 0 count in Rogier")
two_sources %>%
filter(n < 1) %>%
pull(n_rogier) %>%
hist2(color = 2, xlab = "value in Rogier when 0 count in NCSC")
A function to plot the time series from the 2 sources for 1 province:
plot_prov_ts <- function(prov, xlim = ymd(c(20200101, 20220101)), lwd = 1) {
ncsc %>%
filter(province == prov) %$%
plot(date, n, type = "l", col = 2, xlab = NA, ylab = "incidence",
xlim = xlim, lwd = lwd)
rogier_long %>%
filter(province == prov) %$%
lines(date, n_rogier, col = 4, lty = 2, lwd = lwd)
}
Comparing the time series of the two data sets for Ho Chi Minh city:
plot_prov_ts("Ho Chi Minh", ymd(c(20210601, 20220101)), lwd = 2)
legend("topright", legend = c("NCSC", "Rogier"),
col = c(2, 4), lty = 1:2, bty = "n", lwd = 2)
Comparing for all the provinces:
par(mfrow = c(32, 2), plt = c(.2, .97, .17, .9))
ncsc %>%
pull(province) %>%
unique() %>%
sort() %>%
walk(~ {plot_prov_ts(.x); title(.x, line = -1)})
Removing the dates and filtering out the zeros:
rogier1 <- rogier %>%
select(-date) %>%
map(~ .x[.x > 0])
Performing the Benford analysis on each of the 63 provinces:
benford_analyses_rogier1 <- map(rogier1, benford, number.of.digits = 1)
Extracting all these statistics for each of the 63 provinces. Interpretation is as follows:
n: number of data points on which the statistics are calculated.n2: number of data points on which the second order test is performed.Chisq: classical chi-square statistic. Note though that it is highly sensitive to the sample size and tends to reject the null even for small departures from the expected distribution. MAD statistic is thus to be preferred.MAD: Mean Absolute Deviation. The MAD test is more robust since it ignores the number of records. The higher the MAD, the larger the average difference between the observed and theoretical distributions. MAD values above 0.015 suggest nonconformity.MAD conformity: interpretation of the MAD statistic value.DF: Distortion Factor. The DF statistic examines the digit patterns to indicate whether the data appear to be underestimated or overestimated and the deformity’s magnitude.Mantissa: expected to be 0.5. Values less than 0.5 suggest that figures tend to be manipulated downward whereas it’s the opposite for values higher than 0.5.table_rogier1 <- make_statistics_table(benford_analyses_rogier1)
print(table_rogier1, n = Inf)
## # A tibble: 63 x 8
## province n n2 Chisq MAD `MAD conformity` DF Mantissa
## <chr> <int> <int> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 An Giang 143 114 4.38 0.0145 Marginally acceptabl… -13.7 0.00177
## 2 Ba Ria - V… 129 82 16.1 0.0342 Nonconformity -5.98 0.00984
## 3 Bac Giang 123 49 10.5 0.0243 Nonconformity -15.0 0.00331
## 4 Bac Kan 13 2 12.6 0.0996 Nonconformity NaN 0.274
## 5 Bac Lieu 124 68 19.4 0.0373 Nonconformity -16.5 0.0390
## 6 Bac Ninh 139 54 1.29 0.00899 Acceptable conformity -15.3 0.00597
## 7 Ben Tre 125 73 9.08 0.0238 Nonconformity -5.88 0.00707
## 8 Binh Dinh 132 58 17.7 0.0341 Nonconformity -37.5 0.0676
## 9 Binh Duong 147 142 60.0 0.0550 Nonconformity 21.6 0.135
## 10 Binh Phuoc 115 66 10.8 0.0297 Nonconformity -29.9 0.0176
## 11 Binh Thuan 121 86 44.0 0.0543 Nonconformity 7.89 0.139
## 12 Ca Mau 112 68 12.3 0.0336 Nonconformity -9.55 0.0392
## 13 Can Tho 133 90 22.0 0.0364 Nonconformity 9.13 0.00876
## 14 Cao Bang 23 15 8.82 0.0629 Nonconformity -65.1 0.0746
## 15 Da Nang 120 64 7.92 0.0205 Nonconformity -0.793 0.0221
## 16 Dak Lak 106 80 12.2 0.0329 Nonconformity -11.8 0.0500
## 17 Dak Nong 115 45 14.1 0.0312 Nonconformity -4.75 0.0584
## 18 Dien Bien 45 19 5.21 0.0322 Nonconformity -44.4 0.0227
## 19 Dong Nai 151 134 101. 0.0808 Nonconformity 50.6 0.277
## 20 Dong Thap 142 102 35.5 0.0423 Nonconformity 8.72 0.0367
## 21 Gia Lai 109 57 7.18 0.0203 Nonconformity -1.53 0.00845
## 22 Ha Giang 51 45 28.2 0.0797 Nonconformity -6.27 0.202
## 23 Ha Nam 90 31 30.5 0.0594 Nonconformity -45.2 0.104
## 24 Ha Noi 168 78 3.56 0.0127 Marginally acceptabl… -2.41 0.0172
## 25 Ha Tinh 99 29 17.1 0.0409 Nonconformity -36.2 0.00290
## 26 Hai Duong 112 36 15.2 0.0322 Nonconformity -25.8 0.00757
## 27 Hai Phong 45 17 11.2 0.0489 Nonconformity -48.8 0.0412
## 28 Hau Giang 104 62 5.61 0.0190 Nonconformity -19.4 0.00799
## 29 Ho Chi Minh 192 174 23.5 0.0308 Nonconformity -5.19 0.00437
## 30 Hoa Binh 32 15 12.8 0.0520 Nonconformity 5.36 0.000277
## 31 Thua Thien… 103 51 11.3 0.0299 Nonconformity -15.2 0.0444
## 32 Hung Yen 65 23 19.9 0.0547 Nonconformity -42.9 0.112
## 33 Khanh Hoa 137 90 16.8 0.0286 Nonconformity -9.27 0.0351
## 34 Kien Giang 137 103 30.1 0.0392 Nonconformity 6.00 0.0106
## 35 Kon Tum 62 18 6.97 0.0298 Nonconformity -60.5 0.00649
## 36 Lai Chau 16 4 11.4 0.0755 Nonconformity NaN 0.146
## 37 Lam Dong 92 42 8.39 0.0244 Nonconformity -14.7 0.00723
## 38 Lang Son 77 16 3.86 0.0223 Nonconformity -44.8 0.0370
## 39 Lao Cai 49 7 8.75 0.0319 Nonconformity -54.0 0.00321
## 40 Long An 144 113 17.4 0.0282 Nonconformity 12.8 0.0564
## 41 Nam Dinh 69 40 10.7 0.0401 Nonconformity -3.09 0.0493
## 42 Nghe An 124 63 6.02 0.0180 Nonconformity -4.56 0.00912
## 43 Ninh Binh 46 12 7.86 0.0353 Nonconformity -61.7 0.0185
## 44 Ninh Thuan 117 48 20.3 0.0350 Nonconformity -10.4 0.0776
## 45 Phu Tho 65 37 16.6 0.0435 Nonconformity 6.64 0.0671
## 46 Phu Yen 153 47 10.9 0.0260 Nonconformity -37.7 0.0181
## 47 Quang Binh 104 49 9.00 0.0242 Nonconformity -28.6 0.00553
## 48 Quang Nam 108 51 8.04 0.0237 Nonconformity -6.50 0.0171
## 49 Quang Ngai 119 47 15.9 0.0285 Nonconformity -26.9 0.00436
## 50 Quang Ninh 54 27 13.2 0.0432 Nonconformity -32.6 0.0500
## 51 Quang Tri 88 27 3.57 0.0164 Nonconformity -48.3 0.0144
## 52 Soc Trang 82 67 6.18 0.0261 Nonconformity -11.6 0.000278
## 53 Son La 66 18 15.0 0.0454 Nonconformity -60.9 0.0710
## 54 Tay Ninh 131 110 20.4 0.0307 Nonconformity 6.56 0.0342
## 55 Thai Binh 62 26 5.63 0.0248 Nonconformity 20.6 0.0784
## 56 Thai Nguyen 38 16 5.65 0.0306 Nonconformity 24.2 0.0156
## 57 Thanh Hoa 117 48 9.67 0.0291 Nonconformity -3.92 0.0191
## 58 Tien Giang 139 118 8.39 0.0229 Nonconformity -5.77 0.00817
## 59 Tra Vinh 119 76 9.03 0.0239 Nonconformity -9.04 0.00150
## 60 Tuyen Quang 40 11 54.4 0.0942 Nonconformity -26.6 0.178
## 61 Vinh Long 137 74 24.1 0.0307 Nonconformity -8.72 0.0264
## 62 Vinh Phuc 60 31 8.82 0.0378 Nonconformity -22.6 0.0265
## 63 Yen Bai 28 8 18.5 0.0844 Nonconformity -49.9 0.206
Plotting the diagnostic plots for each of the 63 provinces:
par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier1,
names(benford_analyses_rogier1),
plot_digits_distribution)
Same analysis as above but with data from January 2021 only:
rogier2 <- rogier %>%
filter(date > ymd(20210101)) %>%
select(-date) %>%
map(~ .x[.x > 0])
benford_analyses_rogier2 <- map(rogier2, benford, number.of.digits = 1)
table_rogier2 <- make_statistics_table(benford_analyses_rogier2)
print(table_rogier2, n = Inf)
## # A tibble: 63 x 8
## province n n2 Chisq MAD `MAD conformity` DF Mantissa
## <chr> <int> <int> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 An Giang 143 114 4.38 0.0145 Marginally acceptabl… -13.7 0.00177
## 2 Ba Ria - V… 129 82 16.1 0.0342 Nonconformity -5.98 0.00984
## 3 Bac Giang 123 49 10.5 0.0243 Nonconformity -15.0 0.00331
## 4 Bac Kan 13 2 12.6 0.0996 Nonconformity NaN 0.274
## 5 Bac Lieu 124 68 19.4 0.0373 Nonconformity -16.5 0.0390
## 6 Bac Ninh 139 54 1.29 0.00899 Acceptable conformity -15.3 0.00597
## 7 Ben Tre 125 73 9.08 0.0238 Nonconformity -5.88 0.00707
## 8 Binh Dinh 132 58 17.7 0.0341 Nonconformity -37.5 0.0676
## 9 Binh Duong 147 142 60.0 0.0550 Nonconformity 21.6 0.135
## 10 Binh Phuoc 115 66 10.8 0.0297 Nonconformity -29.9 0.0176
## 11 Binh Thuan 121 86 44.0 0.0543 Nonconformity 7.89 0.139
## 12 Ca Mau 112 68 12.3 0.0336 Nonconformity -9.55 0.0392
## 13 Can Tho 133 90 22.0 0.0364 Nonconformity 9.13 0.00876
## 14 Cao Bang 23 15 8.82 0.0629 Nonconformity -65.1 0.0746
## 15 Da Nang 119 64 7.73 0.0203 Nonconformity -0.793 0.0203
## 16 Dak Lak 106 80 12.2 0.0329 Nonconformity -11.8 0.0500
## 17 Dak Nong 115 45 14.1 0.0312 Nonconformity -4.75 0.0584
## 18 Dien Bien 45 19 5.21 0.0322 Nonconformity -44.4 0.0227
## 19 Dong Nai 151 134 101. 0.0808 Nonconformity 50.6 0.277
## 20 Dong Thap 142 102 35.5 0.0423 Nonconformity 8.72 0.0367
## 21 Gia Lai 109 57 7.18 0.0203 Nonconformity -1.53 0.00845
## 22 Ha Giang 51 45 28.2 0.0797 Nonconformity -6.27 0.202
## 23 Ha Nam 90 31 30.5 0.0594 Nonconformity -45.2 0.104
## 24 Ha Noi 165 78 3.32 0.0117 Acceptable conformity -2.41 0.0165
## 25 Ha Tinh 99 29 17.1 0.0409 Nonconformity -36.2 0.00290
## 26 Hai Duong 112 36 15.2 0.0322 Nonconformity -25.8 0.00757
## 27 Hai Phong 45 17 11.2 0.0489 Nonconformity -48.8 0.0412
## 28 Hau Giang 104 62 5.61 0.0190 Nonconformity -19.4 0.00799
## 29 Ho Chi Minh 188 174 23.6 0.0316 Nonconformity -5.19 0.00345
## 30 Hoa Binh 32 15 12.8 0.0520 Nonconformity 5.36 0.000277
## 31 Thua Thien… 103 51 11.3 0.0299 Nonconformity -15.2 0.0444
## 32 Hung Yen 65 23 19.9 0.0547 Nonconformity -42.9 0.112
## 33 Khanh Hoa 137 90 16.8 0.0286 Nonconformity -9.27 0.0351
## 34 Kien Giang 137 103 30.1 0.0392 Nonconformity 6.00 0.0106
## 35 Kon Tum 62 18 6.97 0.0298 Nonconformity -60.5 0.00649
## 36 Lai Chau 16 4 11.4 0.0755 Nonconformity NaN 0.146
## 37 Lam Dong 92 42 8.39 0.0244 Nonconformity -14.7 0.00723
## 38 Lang Son 77 16 3.86 0.0223 Nonconformity -44.8 0.0370
## 39 Lao Cai 49 7 8.75 0.0319 Nonconformity -54.0 0.00321
## 40 Long An 144 113 17.4 0.0282 Nonconformity 12.8 0.0564
## 41 Nam Dinh 69 40 10.7 0.0401 Nonconformity -3.09 0.0493
## 42 Nghe An 124 63 6.02 0.0180 Nonconformity -4.56 0.00912
## 43 Ninh Binh 46 12 7.86 0.0353 Nonconformity -61.7 0.0185
## 44 Ninh Thuan 117 48 20.3 0.0350 Nonconformity -10.4 0.0776
## 45 Phu Tho 65 37 16.6 0.0435 Nonconformity 6.64 0.0671
## 46 Phu Yen 153 47 10.9 0.0260 Nonconformity -37.7 0.0181
## 47 Quang Binh 104 49 9.00 0.0242 Nonconformity -28.6 0.00553
## 48 Quang Nam 108 51 8.04 0.0237 Nonconformity -6.50 0.0171
## 49 Quang Ngai 119 47 15.9 0.0285 Nonconformity -26.9 0.00436
## 50 Quang Ninh 54 27 13.2 0.0432 Nonconformity -32.6 0.0500
## 51 Quang Tri 88 27 3.57 0.0164 Nonconformity -48.3 0.0144
## 52 Soc Trang 82 67 6.18 0.0261 Nonconformity -11.6 0.000278
## 53 Son La 66 18 15.0 0.0454 Nonconformity -60.9 0.0710
## 54 Tay Ninh 131 110 20.4 0.0307 Nonconformity 6.56 0.0342
## 55 Thai Binh 62 26 5.63 0.0248 Nonconformity 20.6 0.0784
## 56 Thai Nguyen 38 16 5.65 0.0306 Nonconformity 24.2 0.0156
## 57 Thanh Hoa 117 48 9.67 0.0291 Nonconformity -3.92 0.0191
## 58 Tien Giang 139 118 8.39 0.0229 Nonconformity -5.77 0.00817
## 59 Tra Vinh 119 76 9.03 0.0239 Nonconformity -9.04 0.00150
## 60 Tuyen Quang 40 11 54.4 0.0942 Nonconformity -26.6 0.178
## 61 Vinh Long 137 74 24.1 0.0307 Nonconformity -8.72 0.0264
## 62 Vinh Phuc 60 31 8.82 0.0378 Nonconformity -22.6 0.0265
## 63 Yen Bai 28 8 18.5 0.0844 Nonconformity -49.9 0.206
par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier2,
names(benford_analyses_rogier2),
plot_digits_distribution)
Let’s compare with the table we get with all the data:
vars <- setdiff(names(table_rogier1), c("province", "MAD conformity"))
par(mfrow = c(2, 3), plt = c(.2, .97, .17, .9), cex = 1)
walk(vars, ~ {
plot(table_rogier1[[.x]], table_rogier2[[.x]], col = 4,
xlab = "all the data", ylab = "from Jan 2021")
abline(0, 1)
title(.x, line = -1)})
group1 <- rogier %>%
transmute(date, n = `Hai Duong` + `Quang Ninh`) %>%
filter(ymd(20210101) < date, date < ymd(20210401)) %>%
select(-date) %>%
map(~ .x[.x > 0])
group2 <- rogier %>%
transmute(date, n = `Bac Giang` + `Bac Ninh`) %>%
filter(ymd(20210401) < date, date < ymd(20210801)) %>%
select(-date) %>%
map(~ .x[.x > 0])
The fourth wave:
wave4 <- rogier %>%
filter(ymd(20210501) < date, date < ymd(20211015)) %>%
select(-date)
Merging with GADM data:
wave4gadm <- wave4 %>%
colSums() %>%
list() %>%
data.frame() %>%
setNames("n") %>%
rownames_to_column("province") %>%
left_join(gadm, ., "province") %>%
mutate(incidence = 100000 * n / popsize)
The south wave in the south:
wave4mekong <- wave4gadm %>%
filter(latitude < 12, incidence > 50)
Which looks like this (in red):
par(plt = c(0, 1, 0, 1))
gadm %>%
st_geometry() %>%
plot(col = "grey")
wave4mekong %>%
st_geometry() %>%
plot(add = TRUE, col = "red")
Group 3 is then:
group3 <- wave4 %>%
magrittr::extract(wave4mekong$province) %>%
rowSums() %>%
list()
group4 <- rogier %>%
filter(date > ymd(20211015)) %>%
select(-date) %>%
rowSums() %>%
list()
Let’s group the 4 groups into a list:
groups <- c(group1, group2, group3, group4) %>%
setNames(paste0("group", 1:4))
on which we can rerun the previous analyses:
benford_analyses_rogier_4groups <- map(groups, benford, number.of.digits = 1)
make_statistics_table(benford_analyses_rogier_4groups)
## # A tibble: 4 x 8
## province n n2 Chisq MAD `MAD conformity` DF Mantissa
## <chr> <int> <int> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 group1 41 16 17.6 0.0465 Nonconformity -30.7 0.00933
## 2 group2 62 37 14.0 0.0481 Nonconformity -2.84 0.0852
## 3 group3 137 125 8.38 0.0182 Nonconformity 3.28 0.0359
## 4 group4 49 48 21.8 0.0563 Nonconformity 25.9 0.113
par(mfrow = c(1, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_rogier_4groups,
names(benford_analyses_rogier_4groups),
plot_digits_distribution)
ncsc_wide <- pivot_wider(ncsc, names_from = province, values_from = n)
ncsc1 <- ncsc_wide %>%
select(-date) %>%
map(~ .x[.x > 0])
benford_analyses_ncsc1 <- map(ncsc1, benford, number.of.digits = 1)
table_ncsc1 <- make_statistics_table(benford_analyses_ncsc1)
print(table_ncsc1, n = Inf)
## # A tibble: 63 x 8
## province n n2 Chisq MAD `MAD conformity` DF Mantissa
## <chr> <int> <int> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 An Giang 161 117 9.71 0.0236 Nonconformity -14.8 0.00693
## 2 Ba Ria - Vu… 175 89 24.9 0.0389 Nonconformity -9.39 0.0360
## 3 Bac Giang 157 68 22.3 0.0358 Nonconformity -17.9 0.0398
## 4 Bac Kan 20 2 25.6 0.111 Nonconformity NaN 0.521
## 5 Bac Lieu 145 67 16.4 0.0326 Nonconformity -18.4 0.0136
## 6 Bac Ninh 173 64 10.5 0.0193 Nonconformity -8.73 0.000392
## 7 Ben Tre 129 72 9.03 0.0223 Nonconformity -3.03 0.0183
## 8 Binh Dinh 139 58 17.8 0.0361 Nonconformity -37.9 0.0616
## 9 Binh Duong 187 153 29.1 0.0301 Nonconformity 14.8 0.0497
## 10 Binh Phuoc 130 67 7.58 0.0225 Nonconformity -29.5 0.00835
## 11 Binh Thuan 143 91 34.0 0.0468 Nonconformity 10.5 0.110
## 12 Ca Mau 127 68 9.68 0.0262 Nonconformity -10.8 0.0192
## 13 Can Tho 151 96 19.9 0.0328 Nonconformity 10.0 0.0172
## 14 Cao Bang 22 14 10.7 0.0704 Nonconformity -65.1 0.0963
## 15 Da Nang 241 73 20.2 0.0303 Nonconformity -9.88 0.0332
## 16 Dak Lak 113 79 11.9 0.0306 Nonconformity -11.5 0.0452
## 17 Dak Nong 124 44 17.3 0.0328 Nonconformity -6.79 0.0689
## 18 Dien Bien 53 19 9.85 0.0446 Nonconformity -46.0 0.0593
## 19 Dong Nai 170 137 80.0 0.0677 Nonconformity 52.6 0.285
## 20 Dong Thap 165 110 16.3 0.0271 Nonconformity 7.47 0.0217
## 21 Gia Lai 130 57 8.33 0.0232 Nonconformity -4.93 0.0112
## 22 Ha Giang 60 47 26.3 0.0710 Nonconformity -4.83 0.185
## 23 Ha Nam 115 32 37.4 0.0582 Nonconformity -44.4 0.0699
## 24 Ha Noi 269 89 14.7 0.0245 Nonconformity -9.59 0.0202
## 25 Ha Tinh 139 28 9.86 0.0245 Nonconformity -42.2 0.00962
## 26 Hai Duong 162 40 20.3 0.0320 Nonconformity -30.2 0.0155
## 27 Hai Phong 62 16 13.9 0.0454 Nonconformity -50.8 0.0302
## 28 Hau Giang 110 64 10.2 0.0273 Nonconformity -24.6 0.00171
## 29 Hoa Binh 49 15 14.7 0.0480 Nonconformity 10.0 0.0121
## 30 Hung Yen 122 23 28.9 0.0502 Nonconformity -47.6 0.0933
## 31 Khanh Hoa 197 101 20.0 0.0286 Nonconformity -16.8 0.0216
## 32 Kien Giang 172 112 25.2 0.0309 Nonconformity 5.99 0.0158
## 33 Kon Tum 66 17 9.27 0.0361 Nonconformity -60.6 0.00598
## 34 Lai Chau 18 4 11.5 0.0792 Nonconformity NaN 0.171
## 35 Lam Dong 106 43 11.0 0.0252 Nonconformity -21.0 0.00596
## 36 Lang Son 99 16 11.2 0.0323 Nonconformity -49.3 0.0542
## 37 Lao Cai 61 10 9.10 0.0364 Nonconformity -65.1 0.0125
## 38 Long An 170 118 27.0 0.0316 Nonconformity 20.2 0.0963
## 39 Nam Dinh 87 40 9.81 0.0272 Nonconformity -3.10 0.0170
## 40 Nghe An 147 65 9.71 0.0246 Nonconformity -10.8 0.0156
## 41 Ninh Binh 76 15 7.51 0.0300 Nonconformity -59.4 0.0430
## 42 Ninh Thuan 140 51 13.8 0.0268 Nonconformity -14.9 0.0583
## 43 Phu Tho 76 38 10.5 0.0325 Nonconformity 6.46 0.0201
## 44 Phu Yen 160 59 5.14 0.0150 Marginally acceptable… -18.4 0.000545
## 45 Quang Binh 114 49 6.62 0.0191 Nonconformity -30.4 0.00360
## 46 Quang Nam 174 51 18.2 0.0318 Nonconformity -18.7 0.0478
## 47 Quang Ngai 150 48 10.3 0.0252 Nonconformity -30.2 0.000540
## 48 Quang Ninh 82 29 15.2 0.0397 Nonconformity -33.3 0.0194
## 49 Quang Tri 101 27 4.97 0.0202 Nonconformity -48.3 0.0198
## 50 Soc Trang 95 72 5.30 0.0210 Nonconformity -14.0 0.00126
## 51 Son La 68 18 16.0 0.0439 Nonconformity -60.9 0.0686
## 52 Tay Ninh 182 119 12.8 0.0248 Nonconformity 0.376 0.0231
## 53 Thai Binh 93 27 9.47 0.0326 Nonconformity 20.3 0.0526
## 54 Thai Nguyen 50 15 14.5 0.0515 Nonconformity 11.6 0.0939
## 55 Thanh Hoa 142 49 4.00 0.0160 Nonconformity -8.51 0.00581
## 56 Thua Thien … 121 50 15.8 0.0344 Nonconformity -15.8 0.0538
## 57 Tien Giang 153 124 9.03 0.0231 Nonconformity -4.54 0.00698
## 58 Ho Chi Minh 279 182 25.5 0.0289 Nonconformity 3.00 0.0161
## 59 Tra Vinh 138 77 7.79 0.0192 Nonconformity -3.34 0.00382
## 60 Tuyen Quang 41 11 63.2 0.102 Nonconformity -26.8 0.190
## 61 Vinh Long 140 71 29.7 0.0322 Nonconformity -7.78 0.0315
## 62 Vinh Phuc 86 32 7.31 0.0272 Nonconformity -28.8 0.0418
## 63 Yen Bai 31 7 21.5 0.0875 Nonconformity -49.5 0.235
par(mfrow = c(16, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_ncsc1,
names(benford_analyses_ncsc1),
plot_digits_distribution)
ncsc_group1 <- ncsc_wide %>%
transmute(date, n = `Hai Duong` + `Quang Ninh`) %>%
filter(ymd(20210101) < date, date < ymd(20210401)) %>%
select(-date) %>%
map(~ .x[.x > 0])
ncsc_group2 <- ncsc_wide %>%
transmute(date, n = `Bac Giang` + `Bac Ninh`) %>%
filter(ymd(20210401) < date, date < ymd(20210801)) %>%
select(-date) %>%
map(~ .x[.x > 0])
ncsc_group3 <- ncsc_wide %>%
filter(ymd(20210501) < date, date < ymd(20211015)) %>%
select(-date) %>%
magrittr::extract(wave4mekong$province) %>%
rowSums() %>%
list()
ncsc_group4 <- ncsc_wide %>%
filter(date > ymd(20211015)) %>%
select(-date) %>%
rowSums() %>%
list()
ncsc_groups <- c(ncsc_group1, ncsc_group2, ncsc_group3, ncsc_group4) %>%
setNames(paste0("group", 1:4))
benford_analyses_ncsc_4groups <- map(ncsc_groups, benford, number.of.digits = 1)
make_statistics_table(benford_analyses_ncsc_4groups)
## # A tibble: 4 x 8
## province n n2 Chisq MAD `MAD conformity` DF Mantissa
## <chr> <int> <int> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 group1 48 24 8.28 0.0372 Nonconformity -35.6 0.0317
## 2 group2 81 55 16.0 0.0386 Nonconformity -29.7 0.0369
## 3 group3 156 136 36.7 0.0419 Nonconformity 18.2 0.156
## 4 group4 48 47 25.8 0.0637 Nonconformity 26.9 0.120
par(mfrow = c(1, 4), plt = c(.2, .97, .17, .9))
walk2(benford_analyses_ncsc_4groups,
names(benford_analyses_ncsc_4groups),
plot_digits_distribution)